Using Standard Methods

source('misc_functions/functions.R')
set.seed(123)
library(mgcv)
dat = gamSim(1, n=400, dist="normal", scale=1, verbose=F)
mod = lm(y ~ x1 + x2, data=dat)
summary(mod)

Call:
lm(formula = y ~ x1 + x2, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.6634 -1.3220 -0.0571  1.5297  7.0937 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   7.2745     0.3332   21.83   <2e-16 ***
x1            6.3616     0.4352   14.62   <2e-16 ***
x2           -5.1214     0.4526  -11.31   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.532 on 397 degrees of freedom
Multiple R-squared:  0.4594,    Adjusted R-squared:  0.4567 
F-statistic: 168.7 on 2 and 397 DF,  p-value: < 2.2e-16
plot(mod, which = 1:2)

# requires broom and glue packages
broom::augment(mod) %>% 
  ggplot(aes(x=.fitted, y=y)) +
  geom_point(alpha=.25, color='#ff5500') + 
  geom_smooth(se=F, color='#00aaff') +
  annotate('text',
           label = glue::glue("Rsq = {round(summary(mod)$r.squared, 2)}"),
           x = 4,
           y = 16) +
  labs(title='Fitted vs. Observed') +
  theme_minimal()

dat %>% 
  select(x1, x2, y) %>% 
  gather(key=variable, value=Predictor, -y) %>% 
  ggplot(aes(x=Predictor, y=y)) +
  geom_point(alpha=.25, color='#ff5500') + 
  geom_smooth(aes(), color='#00aaff', se=F) +
  facet_grid(~variable) + 
  labs(title='Predictors vs. Y') +
  theme_trueMinimal()

Heteroscedasticity, non-normality, etc.

modlog = lm(log(y) ~ x1 + x2, dat)
summary(modlog)

Call:
lm(formula = log(y) ~ x1 + x2, data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-3.07226 -0.13090  0.05803  0.22780  0.79379 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.82673    0.05398  33.839   <2e-16 ***
x1           0.93691    0.07050  13.290   <2e-16 ***
x2          -0.68719    0.07333  -9.371   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4102 on 397 degrees of freedom
Multiple R-squared:  0.3968,    Adjusted R-squared:  0.3937 
F-statistic: 130.6 on 2 and 397 DF,  p-value: < 2.2e-16
plot(modlog, which = 1:2)


broom::augment(mod) %>% 
  ggplot(aes(x=.fitted, y=y)) +
  geom_point(alpha=.25, color='#ff5500') + 
  geom_smooth(se=F, color='#00aaff') +
  annotate('text',
           label = glue::glue("Rsq = {round(summary(modlog)$r.squared, 2)}"),
           x = 4,
           y = 16) +
  labs(title='Fitted vs. Observed') +
  theme_trueMinimal()

Polynomial Regression

set.seed(123)
x = runif(500)
mu = sin(2 * (4 * x - 2)) + 2 * exp(-(16 ^ 2) * ((x - .5) ^ 2))
y = rnorm(500, mu, .3)
d = data.frame(x,y) 

Polynomial regression is problematic

A standard linear regression is definitely not going to capture this relationship. As above, we could try and use polynomial regression here, e.g. fitting a quadratic or cubic function within the standard regression framework. However, this is unrealistic at best and at worst isn’t useful for complex relationships. In the following, even with a polynomial of degree 15 the fit is fairly poor in many areas, and ‘wiggles’ in some places where there doesn’t appear to be a need to.

library(plotly)  # install if you don't have

fits = sapply(seq(3,15, 3), function(p) fitted(lm(y~poly(x,p)))) %>% 
  data.frame(x, y, .) %>% 
  gather(key=polynomial, value=fits, -x, -y) %>% 
  mutate(polynomial = factor(polynomial, labels=seq(3,15, 3)))

plot_ly(data=d) %>% 
  add_markers(~x, ~y, marker=list(color='#ff5500', opacity=.2), showlegend=F) %>% 
  add_lines(~x, ~fits, color=~polynomial, data=fits) %>% 
  theme_plotly()

NA
LS0tCnRpdGxlOiAiVGhlIENhc2UgZm9yIEdBTXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawplZGl0b3Jfb3B0aW9uczogCiAgY2h1bmtfb3V0cHV0X3R5cGU6IGlubGluZQotLS0KCgojIyBVc2luZyBTdGFuZGFyZCBNZXRob2RzCgoKYGBge3IgbWlzY19mdW5jdGlvbnN9CnNvdXJjZSgnbWlzY19mdW5jdGlvbnMvZnVuY3Rpb25zLlInKQpgYGAKCgpgYGB7ciBzaW11bGF0ZWRfZGF0YX0Kc2V0LnNlZWQoMTIzKQpsaWJyYXJ5KG1nY3YpCmRhdCA9IGdhbVNpbSgxLCBuPTQwMCwgZGlzdD0ibm9ybWFsIiwgc2NhbGU9MSwgdmVyYm9zZT1GKQpgYGAKCmBgYHtyIGRlbW9sbX0KbW9kID0gbG0oeSB+IHgxICsgeDIsIGRhdGE9ZGF0KQpzdW1tYXJ5KG1vZCkKYGBgCmBgYHtyIGRpYWdub3N0aWNzfQpwbG90KG1vZCwgd2hpY2ggPSAxOjIpCmBgYAoKYGBge3IgZ2VuZXJhbF9maXR9CiMgcmVxdWlyZXMgYnJvb20gYW5kIGdsdWUgcGFja2FnZXMKYnJvb206OmF1Z21lbnQobW9kKSAlPiUgCiAgZ2dwbG90KGFlcyh4PS5maXR0ZWQsIHk9eSkpICsKICBnZW9tX3BvaW50KGFscGhhPS4yNSwgY29sb3I9JyNmZjU1MDAnKSArIAogIGdlb21fc21vb3RoKHNlPUYsIGNvbG9yPScjMDBhYWZmJykgKwogIGFubm90YXRlKCd0ZXh0JywKICAgICAgICAgICBsYWJlbCA9IGdsdWU6OmdsdWUoIlJzcSA9IHtyb3VuZChzdW1tYXJ5KG1vZCkkci5zcXVhcmVkLCAyKX0iKSwKICAgICAgICAgICB4ID0gNCwKICAgICAgICAgICB5ID0gMTYpICsKICBsYWJzKHRpdGxlPSdGaXR0ZWQgdnMuIE9ic2VydmVkJykgKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCmBgYHtyIHJlbGF0aW9uc2hpcHN9CmRhdCAlPiUgCiAgc2VsZWN0KHgxLCB4MiwgeSkgJT4lIAogIGdhdGhlcihrZXk9dmFyaWFibGUsIHZhbHVlPVByZWRpY3RvciwgLXkpICU+JSAKICBnZ3Bsb3QoYWVzKHg9UHJlZGljdG9yLCB5PXkpKSArCiAgZ2VvbV9wb2ludChhbHBoYT0uMjUsIGNvbG9yPScjZmY1NTAwJykgKyAKICBnZW9tX3Ntb290aChhZXMoKSwgY29sb3I9JyMwMGFhZmYnLCBzZT1GKSArCiAgZmFjZXRfZ3JpZCh+dmFyaWFibGUpICsgCiAgbGFicyh0aXRsZT0nUHJlZGljdG9ycyB2cy4gWScpICsKICB0aGVtZV90cnVlTWluaW1hbCgpCmBgYAoKIyMgSGV0ZXJvc2NlZGFzdGljaXR5LCBub24tbm9ybWFsaXR5LCBldGMuCgpgYGB7ciBsb2dfbm9faGVscH0KbW9kbG9nID0gbG0obG9nKHkpIH4geDEgKyB4MiwgZGF0KQpzdW1tYXJ5KG1vZGxvZykKcGxvdChtb2Rsb2csIHdoaWNoID0gMToyKQoKYnJvb206OmF1Z21lbnQobW9kKSAlPiUgCiAgZ2dwbG90KGFlcyh4PS5maXR0ZWQsIHk9eSkpICsKICBnZW9tX3BvaW50KGFscGhhPS4yNSwgY29sb3I9JyNmZjU1MDAnKSArIAogIGdlb21fc21vb3RoKHNlPUYsIGNvbG9yPScjMDBhYWZmJykgKwogIGFubm90YXRlKCd0ZXh0JywKICAgICAgICAgICBsYWJlbCA9IGdsdWU6OmdsdWUoIlJzcSA9IHtyb3VuZChzdW1tYXJ5KG1vZGxvZykkci5zcXVhcmVkLCAyKX0iKSwKICAgICAgICAgICB4ID0gNCwKICAgICAgICAgICB5ID0gMTYpICsKICBsYWJzKHRpdGxlPSdGaXR0ZWQgdnMuIE9ic2VydmVkJykgKwogIHRoZW1lX3RydWVNaW5pbWFsKCkKYGBgCgoKIyMgUG9seW5vbWlhbCBSZWdyZXNzaW9uCgpgYGB7ciBzaW1EYXRhfQpzZXQuc2VlZCgxMjMpCnggPSBydW5pZig1MDApCm11ID0gc2luKDIgKiAoNCAqIHggLSAyKSkgKyAyICogZXhwKC0oMTYgXiAyKSAqICgoeCAtIC41KSBeIDIpKQp5ID0gcm5vcm0oNTAwLCBtdSwgLjMpCmQgPSBkYXRhLmZyYW1lKHgseSkgCmBgYAoKCiMjIyMgUG9seW5vbWlhbCByZWdyZXNzaW9uIGlzIHByb2JsZW1hdGljCgpBIHN0YW5kYXJkIGxpbmVhciByZWdyZXNzaW9uIGlzIGRlZmluaXRlbHkgbm90IGdvaW5nIHRvIGNhcHR1cmUgdGhpcyByZWxhdGlvbnNoaXAuICBBcyBhYm92ZSwgd2UgY291bGQgdHJ5IGFuZCB1c2UgcG9seW5vbWlhbCByZWdyZXNzaW9uIGhlcmUsIGUuZy4gZml0dGluZyBhIHF1YWRyYXRpYyBvciBjdWJpYyBmdW5jdGlvbiB3aXRoaW4gdGhlIHN0YW5kYXJkIHJlZ3Jlc3Npb24gZnJhbWV3b3JrLiAgSG93ZXZlciwgdGhpcyBpcyB1bnJlYWxpc3RpYyBhdCBiZXN0IGFuZCBhdCB3b3JzdCBpc24ndCB1c2VmdWwgZm9yIGNvbXBsZXggcmVsYXRpb25zaGlwcy4gSW4gdGhlIGZvbGxvd2luZywgZXZlbiB3aXRoIGEgcG9seW5vbWlhbCBvZiBkZWdyZWUgMTUgdGhlIGZpdCBpcyBmYWlybHkgcG9vciBpbiBtYW55IGFyZWFzLCBhbmQgJ3dpZ2dsZXMnIGluIHNvbWUgcGxhY2VzIHdoZXJlIHRoZXJlIGRvZXNuJ3QgYXBwZWFyIHRvIGJlIGEgbmVlZCB0by4KCmBgYHtyIHNpbURhdGFQbG90LCBtZXNzYWdlPUZ9CmZpdHMgPSBzYXBwbHkoc2VxKDMsMTUsIDMpLCBmdW5jdGlvbihwKSBmaXR0ZWQobG0oeX5wb2x5KHgscCkpKSkgJT4lIAogIGRhdGEuZnJhbWUoeCwgeSwgLikgJT4lIAogIGdhdGhlcihrZXk9cG9seW5vbWlhbCwgdmFsdWU9Zml0cywgLXgsIC15KSAlPiUgCiAgbXV0YXRlKHBvbHlub21pYWwgPSBmYWN0b3IocG9seW5vbWlhbCwgbGFiZWxzPXNlcSgzLDE1LCAzKSkpCgpwbG90X2x5KGRhdGE9ZCkgJT4lIAogIGFkZF9tYXJrZXJzKH54LCB+eSwgbWFya2VyPWxpc3QoY29sb3I9JyNmZjU1MDAnLCBvcGFjaXR5PS4yKSwgc2hvd2xlZ2VuZD1GKSAlPiUgCiAgYWRkX2xpbmVzKH54LCB+Zml0cywgY29sb3I9fnBvbHlub21pYWwsIGRhdGE9Zml0cykgJT4lIAogIHRoZW1lX3Bsb3RseSgpCgpgYGAKCgoK